import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from string import punctuation
from wordcloud import WordCloud, STOPWORDS
import plotly.express as px
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# Load the labelled tweet dataset and eyeball a random sample of rows.
df = pd.read_csv('TwitterHate.csv')
df.sample(n=10)
| id | label | tweet | |
|---|---|---|---|
| 22649 | 22650 | 0 | #prayersfororlando #50dead#53injured day |
| 6594 | 6595 | 0 | off to bracebridge, huntsville to look at more... |
| 7186 | 7187 | 0 | .@user on #periscope: rant!!! orlando, florida... |
| 2711 | 2712 | 0 | i hope you all have a #day and do not let th... |
| 4162 | 4163 | 0 | when my depament chair says i have good abilit... |
| 28037 | 28038 | 0 | @user please do something, anything about thes... |
| 14566 | 14567 | 0 | #babies graco fastaction dlx travel system -... |
| 15190 | 15191 | 0 | not acquainted with #punjabi culture, #kejriwa... |
| 29639 | 29640 | 0 | @user odd that you keep saying that you aren'... |
| 24956 | 24957 | 0 | @user 10 best adam sandler movies ever [ #gil... |
# Column overview: per the output below, 31962 rows, no nulls,
# two int64 columns (id, label) and one object column (tweet).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 31962 entries, 0 to 31961 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 31962 non-null int64 1 label 31962 non-null int64 2 tweet 31962 non-null object dtypes: int64(2), object(1) memory usage: 749.2+ KB
import re
def clean_text(text):
    """Lowercase a tweet and strip @-mentions and http(s) URLs.

    Bug fix: the original discarded the return values of the two URL
    ``re.sub`` calls (``re.sub`` is pure and returns a new string), so URL
    removal silently never happened. The results are now assigned back,
    and a single ``https?://\\S+`` pattern covers both schemes.
    """
    text = text.lower()
    # Drop @user mentions.
    text = re.sub(r"@[A-Za-z0-9]+", '', text)
    # Drop http/https links (must assign the result back).
    text = re.sub(r"https?://\S+", '', text)
    return text
# Run the cleaning step over every tweet (map avoids the redundant lambda).
df['tweet'] = df['tweet'].map(clean_text)
df.head(5)
| id | label | tweet | |
|---|---|---|---|
| 0 | 1 | 0 | when a father is dysfunctional and is so sel... |
| 1 | 2 | 0 | thanks for #lyft credit i can't use cause th... |
| 2 | 3 | 0 | bihday your majesty |
| 3 | 4 | 0 | #model i love u take with u all the time in ... |
| 4 | 5 | 0 | factsguide: society now #motivation |
# Tokenize each cleaned tweet with NLTK's tweet-aware tokenizer
# (keeps hashtags and emoticons as single tokens).
tk = TweetTokenizer()
df['tweet'] = df['tweet'].map(lambda tweet: tk.tokenize(str(tweet)))
df.head(5)
| id | label | tweet | |
|---|---|---|---|
| 0 | 1 | 0 | [when, a, father, is, dysfunctional, and, is, ... |
| 1 | 2 | 0 | [thanks, for, #lyft, credit, i, can't, use, ca... |
| 2 | 3 | 0 | [bihday, your, majesty] |
| 3 | 4 | 0 | [#model, i, love, u, take, with, u, all, the, ... |
| 4 | 5 | 0 | [factsguide, :, society, now, #motivation] |
# Filter out English stop words (case-insensitive membership test).
stop_words = set(stopwords.words('english'))
df['tweet'] = df['tweet'].apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in stop_words]
)
df.head(5)
| id | label | tweet | |
|---|---|---|---|
| 0 | 1 | 0 | [father, dysfunctional, selfish, drags, kids, ... |
| 1 | 2 | 0 | [thanks, #lyft, credit, can't, use, cause, off... |
| 2 | 3 | 0 | [bihday, majesty] |
| 3 | 4 | 0 | [#model, love, u, take, u, time, urð, , , ±,... |
| 4 | 5 | 0 | [factsguide, :, society, #motivation] |
# Mojibake fragments and ellipsis tokens left over from bad encodings;
# a set gives O(1) membership checks instead of scanning a list.
JUNK_TOKENS = {
    '2/2', '1/2', 'x91', 'x92', 'x98', 'x9f', 'x98ð', 'x99ð', 'x8a', 'x8f',
    'x80', 'x93', 'x99s', 'x99', 'x8c', 'x80ï', 'x8aâ', 'x8fð', 'x9d',
    'x8eð', 'x84ð', 'x85ð', 'x8cð', 'x93ð', 'x8d', 'x8e', 'x9c', 'x94',
    'x8dð', 'x82', 'x86', 'x96', 'x96ð', 'x82ð', '..', '...',
}
df['tweet'] = df['tweet'].apply(lambda tokens: [t for t in tokens if t not in JUNK_TOKENS])
df.head(5)
| id | label | tweet | |
|---|---|---|---|
| 0 | 1 | 0 | [father, dysfunctional, selfish, drags, kids, ... |
| 1 | 2 | 0 | [thanks, #lyft, credit, can't, use, cause, off... |
| 2 | 3 | 0 | [bihday, majesty] |
| 3 | 4 | 0 | [#model, love, u, take, u, time, urð, , , ±,... |
| 4 | 5 | 0 | [factsguide, :, society, #motivation] |
def remove_char(words):
    """Strip the leading '#' from hashtag tokens, leaving other tokens alone.

    Bug fix: the original tested ``'#' in w`` and then dropped the FIRST
    character unconditionally, which mangled tokens with an interior '#'
    (e.g. 'a#b' became '#b'). Only a genuine leading '#' is removed now.
    """
    return [w[1:] if w.startswith('#') else w for w in words]
# De-hashtag every token list.
df['tweet'] = df['tweet'].map(remove_char)
df.head()
| id | label | tweet | |
|---|---|---|---|
| 0 | 1 | 0 | [father, dysfunctional, selfish, drags, kids, ... |
| 1 | 2 | 0 | [thanks, lyft, credit, can't, use, cause, offe... |
| 2 | 3 | 0 | [bihday, majesty] |
| 3 | 4 | 0 | [model, love, u, take, u, time, urð, , , ±, ... |
| 4 | 5 | 0 | [factsguide, :, society, motivation] |
# Discard very short tokens (two characters or fewer, e.g. 'u', 'ur').
df['tweet'] = df['tweet'].apply(lambda tokens: [t for t in tokens if len(t) > 2])
df.head()
| id | label | tweet | |
|---|---|---|---|
| 0 | 1 | 0 | [father, dysfunctional, selfish, drags, kids, ... |
| 1 | 2 | 0 | [thanks, lyft, credit, can't, use, cause, offe... |
| 2 | 3 | 0 | [bihday, majesty] |
| 3 | 4 | 0 | [model, love, take, time, urð] |
| 4 | 5 | 0 | [factsguide, society, motivation] |
# Keep purely alphabetic tokens only (drops numbers and punctuation mixes).
df['tweet'] = df['tweet'].apply(lambda tokens: [t for t in tokens if t.isalpha()])
df.head()
| id | label | tweet | |
|---|---|---|---|
| 0 | 1 | 0 | [father, dysfunctional, selfish, drags, kids, ... |
| 1 | 2 | 0 | [thanks, lyft, credit, use, cause, offer, whee... |
| 2 | 3 | 0 | [bihday, majesty] |
| 3 | 4 | 0 | [model, love, take, time, urð] |
| 4 | 5 | 0 | [factsguide, society, motivation] |
→ Use the Counter to find the most common terms (the 20 most frequent are shown below).
# Flatten all token lists into one list and count term frequencies.
words_freq = [w for sent in df['tweet'] for w in sent]
counter = Counter(words_freq)
# Build the Word/Count frame straight from the counter. The original went
# through np.array(...).T, which coerced counts to strings and then needed
# pd.to_numeric to undo that; constructing from pairs keeps dtypes sane.
df_freq = pd.DataFrame(list(counter.items()), columns=['Word', 'Count'])
df_freq['Count'] = pd.to_numeric(df_freq['Count'])
df_freq = df_freq.sort_values(by='Count', ascending=False)
# reset_index expects a bool for drop; the original passed the (truthy)
# string 'index'. Display the 20 most frequent terms, 1-indexed.
df_freq_top20 = df_freq.head(20).reset_index(drop=True)
df_freq_top20.index = df_freq_top20.index + 1
df_freq_top20
| Word | Count | |
|---|---|---|
| 1 | love | 2748 |
| 2 | day | 2276 |
| 3 | happy | 1684 |
| 4 | time | 1131 |
| 5 | life | 1118 |
| 6 | like | 1047 |
| 7 | today | 1013 |
| 8 | new | 994 |
| 9 | thankful | 946 |
| 10 | positive | 931 |
| 11 | get | 917 |
| 12 | good | 862 |
| 13 | people | 859 |
| 14 | bihday | 844 |
| 15 | one | 783 |
| 16 | see | 762 |
| 17 | smile | 712 |
| 18 | want | 649 |
| 19 | take | 621 |
| 20 | work | 610 |
# Bar chart of the 20 most frequent tokens computed above.
fig = px.bar(df_freq_top20, x='Word', y='Count',title='Top 20 terms in tweet')
fig.show()
→ Join the tokens back to form strings. This will be required for the vectorizers.
→ Assign x and y.
→ Perform train_test_split using sklearn.
# Re-join the token lists into whitespace-separated strings --
# the sklearn vectorizer below expects raw text, not lists.
df['tweet'] = df['tweet'].apply(' '.join)
df.head()
| id | label | tweet | |
|---|---|---|---|
| 0 | 1 | 0 | father dysfunctional selfish drags kids dysfun... |
| 1 | 2 | 0 | thanks lyft credit use cause offer wheelchair ... |
| 2 | 3 | 0 | bihday majesty |
| 3 | 4 | 0 | model love take time urð |
| 4 | 5 | 0 | factsguide society motivation |
# Features are the cleaned tweet strings; target is the 0/1 hate label.
X = df['tweet']
y = df['label']
# Hold out 25% for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(23971,) (7991,) (23971,) (7991,)
# Class balance plot -- the hate class (1) is a small minority.
# Fix: 'lable' typo in the user-facing chart title corrected to 'label'.
px.histogram(y, title='Count Plot of label Non-Hate(0) and Hate(1)')
# Sanity check: no missing values in any column (all zeros below).
df.isnull().sum()
Import TF-IDF vectorizer from sklearn.
Instantiate with a maximum of 5000 terms in your vocabulary.
Fit and apply on the train set.
Apply on the test set.
# TF-IDF vectorizer capped at a 5000-term vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
# Fit the vocabulary/IDF weights on the training text only, then apply the
# fitted transform to the test set -- avoids leaking test-set statistics.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
X_train.shape, X_test.shape
((23971, 5000), (7991, 5000))
Instantiate Logistic Regression from sklearn with default parameters.
Fit into the train data.
Make predictions for the train and the test set.
# Baseline logistic regression with default hyper-parameters
# (fit returns the estimator, so construction and fitting chain).
LRmodel = LogisticRegression().fit(X_train, y_train)
train_predictions = LRmodel.predict(X_train)
test_predictions = LRmodel.predict(X_test)
Report the accuracy on the train set.
Report the recall on the train set: decent, high, or low.
Get the f1 score on the train set.
# Report train/test accuracy as percentages (f-strings over %-formatting).
print(f'Accuracy Score on training set {accuracy_score(y_train, train_predictions) * 100:.3f} %')
print(f'Accuracy Score on test set {accuracy_score(y_test, test_predictions) * 100:.3f} %')
Accuracy Score on training set 95.503 % Accuracy Score on test set 95.032 %
# Per-class precision/recall/f1 on the training set, as one print call.
print('Classification Report Training set',
      '------------------------------------------------------',
      classification_report(y_train, train_predictions),
      sep='\n')
Classification Report Training set
------------------------------------------------------
precision recall f1-score support
0 0.96 1.00 0.98 22288
1 0.95 0.38 0.54 1683
accuracy 0.96 23971
macro avg 0.95 0.69 0.76 23971
weighted avg 0.96 0.96 0.95 23971
# Per-class precision/recall/f1 on the held-out test set, as one print call.
print('Classification Report Testing set',
      '------------------------------------------------------',
      classification_report(y_test, test_predictions),
      sep='\n')
Classification Report Testing set
------------------------------------------------------
precision recall f1-score support
0 0.95 1.00 0.97 7432
1 0.91 0.32 0.48 559
accuracy 0.95 7991
macro avg 0.93 0.66 0.72 7991
weighted avg 0.95 0.95 0.94 7991
Adjust the appropriate class in the LogisticRegression model.
# Up-weight the rare hate class (~13x) to counter the class imbalance.
weights = {0: 1.0, 1: 13.0}
LRmodel = LogisticRegression(solver='lbfgs', class_weight=weights)
Train the model on the train set.
Evaluate the predictions on the train set: accuracy, recall, and f_1 score.
# Fit the class-weighted model and predict on both splits.
LRmodel.fit(X_train, y_train)
train_predictions = LRmodel.predict(X_train)
test_predictions = LRmodel.predict(X_test)

# Classification reports for the training and testing sets.
print('Classification Report Training set',
      '------------------------------------------------------',
      classification_report(y_train, train_predictions),
      sep='\n')
print('\n')
print('Classification Report Testing set',
      '------------------------------------------------------',
      classification_report(y_test, test_predictions),
      sep='\n')
Classification Report Training set
------------------------------------------------------
precision recall f1-score support
0 1.00 0.95 0.97 22288
1 0.61 0.98 0.75 1683
accuracy 0.95 23971
macro avg 0.80 0.97 0.86 23971
weighted avg 0.97 0.95 0.96 23971
Classification Report Testing set
------------------------------------------------------
precision recall f1-score support
0 0.98 0.94 0.96 7432
1 0.48 0.77 0.59 559
accuracy 0.93 7991
macro avg 0.73 0.85 0.78 7991
weighted avg 0.95 0.93 0.93 7991
Import RandomizedSearchCV and StratifiedKFold because of class imbalance.
Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.
Use a balanced class weight while instantiating the logistic regression.
from scipy.stats import loguniform

# Hyper-parameter search space for RandomizedSearchCV, as a single literal.
space = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    # C sampled log-uniformly across seven orders of magnitude.
    'C': loguniform(1e-5, 100),
}
space
{'solver': ['newton-cg', 'lbfgs', 'liblinear'],
'penalty': ['l1', 'l2', 'elasticnet'],
'C': <scipy.stats._distn_infrastructure.rv_frozen at 0x224dd298e80>}
Choose ‘recall’ as the metric for scoring.
Choose stratified 4 fold cross validation scheme.
Fit into the train set.
import warnings

# Silence convergence/penalty-combination warnings during the search.
warnings.filterwarnings("ignore")

# Equal class weights here; the search optimizes recall instead.
weights = {0: 1.0, 1: 1.0}
LRmodel = LogisticRegression(class_weight=weights)
# Stratified folds preserve the minority-class proportion in each split.
folds = StratifiedKFold(n_splits=4)
grid_search = RandomizedSearchCV(
    estimator=LRmodel,
    param_distributions=space,
    n_iter=200,
    scoring='recall',
    n_jobs=-1,
    cv=folds,
)
grid_result = grid_search.fit(X_train, y_train)
grid_result.best_estimator_
LogisticRegression(C=74.03525251914729, class_weight={0: 1.0, 1: 1.0},
penalty='l1', solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. LogisticRegression(C=74.03525251914729, class_weight={0: 1.0, 1: 1.0},
penalty='l1', solver='liblinear')Use the best estimator from the grid search to make predictions on the test set.
What is the recall on the test set for the toxic comments?
What is the f_1 score?
# Refit a fresh model with the best hyper-parameters found by the search.
LRmodel = LogisticRegression(
    C=74.03525251914729,
    penalty='l1',
    solver='liblinear',
    class_weight=weights,
)
LRmodel.fit(X_train, y_train)
train_predictions = LRmodel.predict(X_train)
test_predictions = LRmodel.predict(X_test)

# Classification reports for both splits.
print('Classification Report Training set',
      '------------------------------------------------------',
      classification_report(y_train, train_predictions),
      sep='\n')
print('Classification Report Testing set',
      '------------------------------------------------------',
      classification_report(y_test, test_predictions),
      sep='\n')
Classification Report Training set
------------------------------------------------------
precision recall f1-score support
0 1.00 1.00 1.00 22288
1 0.99 0.97 0.98 1683
accuracy 1.00 23971
macro avg 0.99 0.98 0.99 23971
weighted avg 1.00 1.00 1.00 23971
Classification Report Testing set
------------------------------------------------------
precision recall f1-score support
0 0.97 0.96 0.97 7432
1 0.56 0.62 0.59 559
accuracy 0.94 7991
macro avg 0.76 0.79 0.78 7991
weighted avg 0.94 0.94 0.94 7991
# Confusion matrix for the training set.
cm_train = confusion_matrix(y_train, train_predictions, labels=LRmodel.classes_)
disp = ConfusionMatrixDisplay(cm_train)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x224dd421250>
# Confusion matrix for the held-out test set.
cm_test = confusion_matrix(y_test, test_predictions, labels=LRmodel.classes_)
disp = ConfusionMatrixDisplay(cm_test)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x224de86eca0>